In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
pandasWe've touched a little bit on time so far - mostly how tragic it is to parse - but pandas can do some neat things with it once you figure out how it works.
Let's open up some data from the Census bureau - we're going to use New Home Sales. The data is formatted... oddly, so I've done the importing and joining for you below.
In [2]:
data_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data')
data_df.head()
Out[2]:
In [3]:
categories_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='categories')
data_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data_types')
error_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='error_types')
geo_levels_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='geo_levels')
periods_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='periods')
In [4]:
categories_df.head(2)
Out[4]:
In [5]:
# it auto-merges cat_idx in our original dataframe with cat_idx in categories_df
# it auto-merges dt_idx in our original dataframe with dt_idx in data_types_df
# it auto-merges geo_idx in our original dataframe with geo_idx in geo_levels_df
# it auto-merges per_idx in our original dataframe with per_idx in periods_df
df = data_df.merge(categories_df).merge(data_types_df).merge(geo_levels_df).merge(periods_df)
# We only want to look at the total number of homes sold across entire the united states
df = df[(df['cat_code'] == 'SOLD') & (df['geo_code'] == 'US') & (df['dt_code'] == 'TOTAL')]
In [6]:
# We don't merge error_types_df because all of the errors are the same
df['et_idx'].value_counts()
Out[6]:
In [7]:
df.head(2)
Out[7]:
In [8]:
# Now let's remove the join columns to keep things clean
df = df.drop(['per_idx', 'cat_idx', 'dt_idx', 'et_idx', 'geo_idx'], axis=1)
df.head()
Out[8]:
In [9]:
# At least we can see 'per_name' (period name) is already a datetime!
df.info()
In [10]:
df.head(3)
Out[10]:
It's the column on the far left - 0, 1, 2, 3, 4... boring and useless! If we replace the index with the datetime, though, we can start to have some fun
In [11]:
# First we move it over into the index column
df.index = df['per_name']
df.head(2)
Out[11]:
In [12]:
# Then we delete the per_name column because we don't need it any more...
del df['per_name']
df.head(2)
Out[12]:
In [13]:
# Everything in March, 1963
df['1963-3']
Out[13]:
In [14]:
# Everything in 2010
df['2010']
Out[14]:
In [15]:
# Make our list of fruits
ranked_fruits = ('banana', 'orange', 'apple', 'blueberries', 'strawberries')
In [16]:
# Start from the beginning, get the first two
ranked_fruits[:2]
Out[16]:
In [17]:
# Start from two, get up until the fourth element
ranked_fruits[2:4]
Out[17]:
In [18]:
# Starting from the third element, get all the rest
ranked_fruits[3:]
Out[18]:
Instead of using boring ol' numbers, we can use dates instead.
In [19]:
# Everything after 2001
df["2001":]
Out[19]:
In [20]:
# Everything between June 1990 and March 1995
df["1990-06":"1995-03"]
Out[20]:
In [21]:
df.plot(y='val')
Out[21]:
Hmmm, looks like something might have happened at some point. Maybe we want to see some numbers instead of a graph? To do aggregate statistics on time series in pandas we use a method called .resample(), and we're going to tell it to group the data by year.
In [22]:
# http://stackoverflow.com/a/17001474 gives us a list of what we can pass to 'resample'
df.resample('A').median()
Out[22]:
That still looks like too much data! What about every decade?
In [23]:
# If 'A' is every year, 10A is every 5 years
df.resample('5A').median()
Out[23]:
In [24]:
# We can graph these!
df.plot(y='val', label="Monthly")
df.resample('A').median().plot(y='val', label="Annual")
df.resample('10A').median().plot(y='val', label="Decade")
Out[24]:
In [25]:
# We can graph these ALL ON THE SAME PLOT!
# we store the 'ax' from the first .plot and pass it to the others
ax = df.plot(y='val', label="Monthly")
df.resample('A').median().plot(y='val', ax=ax, label="Annual")
df.resample('10A').median().plot(y='val', ax=ax, label="Decade")
Out[25]:
In [26]:
# Which year had the worst month?
df.resample('A').median()
Out[26]:
In [ ]:
In [ ]:
In [27]:
# Group by the month, check the median
df.groupby(by=df.index.month).median()
Out[27]:
In [28]:
# Group by the month, check the median, plot the results
df.groupby(by=df.index.month).median().plot(y='val')
Out[28]:
In [29]:
# Group by the month, check the median, plot the results
ax = df.groupby(by=df.index.month).median().plot(y='val', legend=False)
ax.set_xticks([1,2,3,4,5,6,7,8,9,10,11, 12])
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax.set_ylabel("Houses sold (in thousands)")
ax.set_title("House sales by month, 1963-2016")
Out[29]:
In [30]:
df.resample('A')['val'].max().plot()
Out[30]:
In [31]:
# The fewest?
df.resample('A')['val'].min().plot()
Out[31]:
In [32]:
# We now know we can look at the range
ax = df.resample('A')['val'].median().plot()
df.resample('A')['val'].max().plot(ax=ax)
df.resample('A')['val'].min().plot(ax=ax)
Out[32]:
In [33]:
# We now know we can look at the range IN AN EVEN COOLER WAY
ax = df.resample('A')['val'].median().plot()
x_values = df.resample('A').index
min_values = df.resample('A')['val'].min()
max_values = df.resample('A')['val'].max()
ax.fill_between(x_values, min_values, max_values, alpha=0.5)
ax.set_ylim([0,130])
ax.set_ylabel("Houses sold (in thousands)")
ax.set_xlabel("Year")
ax.set_title("The Housing Bubble")
Out[33]:
In [ ]:
In [ ]:
In [ ]: